Code
import eia_api as api
import eia_data
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import plotly.express as px
import plotly.graph_objects as goThe goal of the backfill process is to pull the historical data for the required series using the settings.json file. This includes the following steps:
import eia_api as api
import eia_data
import pandas as pd
import numpy as np
import requests
import json
import os
import datetime
import plotly.express as px
import plotly.graph_objects as goraw_json = open("./settings/settings.json")
meta_json = json.load(raw_json)
series = pd.DataFrame(meta_json["series"])
api_path = meta_json["api_path"]facets_template = {
"parent" : None,
"subba" : None
}
start = datetime.datetime(meta_json["start"]["year"],
meta_json["start"]["month"],
meta_json["start"]["day"],
meta_json["start"]["hour"])
end = datetime.datetime(meta_json["end"]["year"],
meta_json["end"]["month"],
meta_json["end"]["day"],
meta_json["end"]["hour"])
offset = 2250
eia_api_key = os.getenv('EIA_API_KEY')
meta_path = meta_json["meta_path"]
data_path = meta_json["data_path"]
series_mapping_path = meta_json["series_mapping_path"]metadata = api.eia_metadata(api_key = eia_api_key, api_path = api_path)
print(metadata.meta.keys())
print(metadata.meta["startPeriod"])
print(metadata.meta["endPeriod"])dict_keys(['id', 'name', 'description', 'frequency', 'facets', 'data', 'startPeriod', 'endPeriod', 'defaultDateFormat', 'defaultFrequency'])
2019-01-01T00
2024-10-26T07
for i in series.index:
facets = facets_template
facets["parent"] = series.at[i, "parent_id"]
facets["subba"] = series.at[i, "subba_id"]
print(facets)
temp = api.eia_backfill(api_key = eia_api_key,
api_path = api_path+ "data",
facets = facets,
start = start,
end = end,
offset = offset)
ts_obj = pd.DataFrame(np.arange(start = start, stop = end + datetime.timedelta(hours = 1), step = datetime.timedelta(hours = 1)).astype(datetime.datetime), columns=["index"])
ts_obj = ts_obj.merge(temp.data, left_on = "index", right_on = "period", how="left")
ts_obj.drop("period", axis = 1, inplace= True)
ts_obj = ts_obj.rename(columns= {"index": "period"})
missing_index = ts_obj[ts_obj["value"].isnull()].index.tolist()
if len(missing_index) > 0:
ts_obj.loc[missing_index,"subba"] = facets["subba"]
meta_temp = eia_data.create_metadata(data = ts_obj, start = start, end = end, type = "backfill")
meta_temp["index"] = 1
meta_df = pd.DataFrame([meta_temp])
if i == series.index.start:
data = ts_obj
meta = meta_df
else:
data = data._append(ts_obj)
meta = meta._append(meta_df)
data.reset_index(drop=True, inplace=True)
meta.reset_index(drop=True, inplace=True){'parent': 'NYIS', 'subba': 'ZONA'}
{'parent': 'NYIS', 'subba': 'ZONB'}
{'parent': 'NYIS', 'subba': 'ZONC'}
{'parent': 'NYIS', 'subba': 'ZOND'}
{'parent': 'NYIS', 'subba': 'ZONE'}
{'parent': 'NYIS', 'subba': 'ZONF'}
{'parent': 'NYIS', 'subba': 'ZONG'}
{'parent': 'NYIS', 'subba': 'ZONH'}
{'parent': 'NYIS', 'subba': 'ZONI'}
{'parent': 'NYIS', 'subba': 'ZONJ'}
{'parent': 'NYIS', 'subba': 'ZONK'}
print(meta)
# The initial pull has some missing values
print("Missing Values:" , data["value"].isna().sum())
data.head() index parent subba time start \
0 1 NYIS ZONA 2024-10-26 11:23:03.092152+00:00 2022-01-01
1 1 NYIS ZONB 2024-10-26 11:23:13.342822+00:00 2022-01-01
2 1 NYIS ZONC 2024-10-26 11:23:27.939967+00:00 2022-01-01
3 1 NYIS ZOND 2024-10-26 11:23:41.014740+00:00 2022-01-01
4 1 NYIS ZONE 2024-10-26 11:23:54.614723+00:00 2022-01-01
5 1 NYIS ZONF 2024-10-26 11:24:07.774465+00:00 2022-01-01
6 1 NYIS ZONG 2024-10-26 11:24:20.903398+00:00 2022-01-01
7 1 NYIS ZONH 2024-10-26 11:24:33.983207+00:00 2022-01-01
8 1 NYIS ZONI 2024-10-26 11:24:47.645812+00:00 2022-01-01
9 1 NYIS ZONJ 2024-10-26 11:25:02.294246+00:00 2022-01-01
10 1 NYIS ZONK 2024-10-26 11:25:15.390137+00:00 2022-01-01
end start_act end_act start_match end_match \
0 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
1 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
2 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
3 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
4 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
5 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
6 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
7 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
8 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
9 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
10 2024-10-01 01:00:00 2022-01-01 2024-10-01 01:00:00 True True
n_obs na imputed type update success comments
0 24098 1 0 backfill False False Missing values were found;
1 24098 1 0 backfill False False Missing values were found;
2 24098 1 0 backfill False False Missing values were found;
3 24098 1 0 backfill False False Missing values were found;
4 24098 1 0 backfill False False Missing values were found;
5 24098 1 0 backfill False False Missing values were found;
6 24098 1 0 backfill False False Missing values were found;
7 24098 1 0 backfill False False Missing values were found;
8 24098 1 0 backfill False False Missing values were found;
9 24098 1 0 backfill False False Missing values were found;
10 24098 1 0 backfill False False Missing values were found;
Missing Values: 11
| period | subba | subba-name | parent | parent-name | value | value-units | |
|---|---|---|---|---|---|---|---|
| 0 | 2022-01-01 00:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1707.0 | megawatthours |
| 1 | 2022-01-01 01:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1673.0 | megawatthours |
| 2 | 2022-01-01 02:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1644.0 | megawatthours |
| 3 | 2022-01-01 03:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1605.0 | megawatthours |
| 4 | 2022-01-01 04:00:00 | ZONA | West - NYIS | NYIS | New York Independent System Operator | 1550.0 | megawatthours |
def impute_missing(input, var, index):
class imputed_values:
def __init__(self, data, missing_index, num_imputed):
self.data = data
self.missing_index = missing_index
self.num_imputed = num_imputed
input["impute"] = np.NaN
input = input.sort_values(by = [index])
missing_index = input[input[var].isnull()].index.tolist()
non_missing_index = input.index.difference(missing_index).tolist()
num_imputed = 0
for i in missing_index:
if i > 336:
input.loc[i ,"impute"] = (input.loc[i - 336 ,var] + input.loc[i - 168 ,var] + input.loc[i - 24 ,var]) / 3
num_imputed = num_imputed + 1
elif i > 168:
input.loc[i ,"impute"] = (input.loc[i - 168 ,var] + input.loc[i - 24 ,var]) / 2
num_imputed = num_imputed + 1
elif i > 24:
input.loc[i ,"impute"] = input.loc[i - 24 ,var]
num_imputed = num_imputed + 1
else:
print("There are not enough observations to impute observation:", i)
input["y"] = np.NaN
input.loc[missing_index, "y"] = input.loc[missing_index, "impute"]
input.loc[non_missing_index, "y"] = input.loc[non_missing_index, var]
output = imputed_values(data = input, missing_index = missing_index, num_imputed = num_imputed)
return outputdef impute_series(series, meta):
class imputed_series:
def __init__(self, data, metadata):
self.data = data
self.metadata = metadata
new_df = None
for index, row in meta.iterrows():
s = row["subba"]
temp = None
temp = series[series["subba"] == s]
if row["na"] > 0:
print("Series", s, "has", row["na"], "missing values")
imputed = impute_missing(input = temp, var = "value", index = "period")
if imputed.num_imputed > 0:
temp = imputed.data
meta.loc[index, "imputed"] = imputed.num_imputed
meta.loc[index, "comments"] = meta.loc[index, "comments"] + " Missing values were imputed"
else:
temp["impute"] = np.NaN
temp["y"] = temp["value"]
else:
temp["impute"] = np.NaN
temp["y"] = temp["value"]
if all([meta.loc[index, "na"] == meta.loc[index, "imputed"],meta.loc[index, "start_match"],meta.loc[index, "end_match"]]):
meta.loc[index, "success"] = True
meta.loc[index, "update"] = True
if meta.loc[index, "success"]:
if new_df is None:
new_df = temp
else:
new_df = pd.concat([new_df, temp])
if new_df is not None:
output = imputed_series(data = new_df, metadata = meta)
else:
output = None
return outputts = impute_series(series= data, meta = meta)Series ZONA has 1 missing values
Series ZONB has 1 missing values
Series ZONC has 1 missing values
Series ZOND has 1 missing values
Series ZONE has 1 missing values
Series ZONF has 1 missing values
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Series ZONG has 1 missing values
Series ZONH has 1 missing values
Series ZONI has 1 missing values
Series ZONJ has 1 missing values
Series ZONK has 1 missing values
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/tmp/ipykernel_5697/1878324271.py:8: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
We will use Plotly to visualize the series:
fig = go.Figure()
for i in ts.data["subba"].unique():
d = None
d = ts.data[ts.data["subba"] == i]
fig.add_trace(go.Scatter(x=d["period"],
y=d["value"],
name = i,
# line = dict(color = "blue"),
mode='lines'))
fig.add_trace(go.Scatter(
x=ts.data["period"],
y=ts.data["impute"],
mode='markers',
name = "Imputed",
marker=dict(size=4, symbol='square', color = "red")
)
)
fig.update_layout(title = "New York Independent System Operator - Demand for Electricity")
fig